# The tidyverse package contains ggplot2, dplyr, and several other packages we will use
library(tidyverse)
# The gridExtra package contains the grid.arrange function used to combine plots in the same window
library(gridExtra)
# The janitor package contains tidyverse-friendly functions for cross-tables (tabyl, adorn_*)
library(janitor)
# The knitr package contains some table formatting functions
library(knitr)
# The GGally package contains a custom correlation plot we will use
library(GGally)
# The lubridate package contains functions to convert dates
library(lubridate)
# The colorspace package contains color palettes for graph design
library(colorspace)
# The ggthemes package contains themes for graph design
library(ggthemes)
# The dplyr package provides verbs for data manipulation
# NOTE(review): dplyr is already attached by library(tidyverse) above, so this line is redundant
library(dplyr)
# The scales package contains formatting for dollar values on graphs
library(scales)
# Load data (expects mtp_off_mate.csv in the working directory)
om <- read.csv("mtp_off_mate.csv")
# View the first six rows of the data
head(om)
## Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 CA-2014-152156 11/9/2014 11/12/2014 Second Class CG-12520
## 2 CA-2014-152156 11/9/2014 11/12/2014 Second Class CG-12520
## 3 CA-2014-138688 6/13/2014 6/17/2014 Second Class DV-13045
## 4 US-2013-108966 10/11/2013 10/18/2013 Standard Class SO-20335
## 5 US-2013-108966 10/11/2013 10/18/2013 Standard Class SO-20335
## 6 CA-2012-115812 6/9/2012 6/14/2012 Standard Class BH-11710
## City State Postal.Code Region Product.ID
## 1 Henderson Kentucky 42420 South FUR-BO-10001798
## 2 Henderson Kentucky 42420 South FUR-CH-10000454
## 3 Los Angeles California 90036 West OFF-LA-10000240
## 4 Fort Lauderdale Florida 33311 South FUR-TA-10000577
## 5 Fort Lauderdale Florida 33311 South OFF-ST-10000760
## 6 Los Angeles California 90032 West FUR-FU-10001487
## Product.Name Segment
## 1 Bush Somerset Collection Bookcase Consumer
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back Consumer
## 3 Self-Adhesive Address Labels for Typewriters by Universal Corporate
## 4 Bretford CR4500 Series Slim Rectangular Table Consumer
## 5 Eldon Fold 'N Roll Cart System Consumer
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood Consumer
## Category Sub.Category Revenue Quantity Discount Profit
## 1 Furniture Bookcases 261.9600 2 0.00 41.91
## 2 Furniture Chairs 731.9400 3 0.00 219.58
## 3 Office Supplies Labels 14.6200 2 0.00 6.87
## 4 Furniture Tables 957.5775 5 0.45 -383.03
## 5 Office Supplies Storage 22.3680 2 0.20 2.52
## 6 Furniture Furnishings 48.8600 7 0.00 14.17
Comments
There is a lot of factor data here
-not wise to overcrowd analysis with factor variables with 5+ levels -may have to look into splitting up/ranking data for better analysis
Data appears tidy and very normal
Number of observations is relatively small
Questions
Will some of these factors serve use better as quantitative variables?
-In order to get a different set of information out of them. -Zip code would be one such variable.
# Check the structure of the data: dimensions, column types, and factor level counts
str(om)
## 'data.frame': 9994 obs. of 18 variables:
## $ Order.ID : Factor w/ 5009 levels "CA-2012-100006",..: 2501 2501 2297 4373 4373 202 202 202 202 202 ...
## $ Order.Date : Factor w/ 1237 levels "1/1/2015","1/10/2012",..: 309 309 841 94 94 923 923 923 923 923 ...
## $ Ship.Date : Factor w/ 1334 levels "1/1/2013","1/1/2014",..: 224 224 911 129 129 897 897 897 897 897 ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
## $ Customer.ID : Factor w/ 793 levels "AA-10315","AA-10375",..: 144 144 240 706 706 89 89 89 89 89 ...
## $ City : Factor w/ 531 levels "Aberdeen","Abilene",..: 195 195 267 154 154 267 267 267 267 267 ...
## $ State : Factor w/ 49 levels "Alabama","Arizona",..: 16 16 4 9 9 4 4 4 4 4 ...
## $ Postal.Code : int 42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
## $ Region : Factor w/ 4 levels "Central","East",..: 3 3 4 3 3 4 4 4 4 4 ...
## $ Product.ID : Factor w/ 1862 levels "FUR-BO-10000112",..: 13 56 947 320 1317 186 563 1762 795 438 ...
## $ Product.Name: Factor w/ 1850 levels "\"While you Were Out\" Message Book, One Form per Page",..: 387 833 1440 368 574 570 1137 1099 535 295 ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
## $ Sub.Category: Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
## $ Revenue : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
Comments
Consistent with above, factor variables have a lot of levels
Orders very neatly organized: Segment, then category, then sub category
Few quantitative variables to work with
No errors to be found
Questions + Which factor variables will contribute most to analysis?
-will it be purchase type or region, for example\
# Five-number summaries for numeric columns and level counts for factor columns
summary(om)
## Order.ID Order.Date Ship.Date
## CA-2015-100111: 14 9/6/2014 : 38 12/16/2013: 35
## CA-2015-157987: 12 9/3/2015 : 36 9/27/2015 : 34
## CA-2014-165330: 11 11/11/2014: 35 11/22/2015: 32
## US-2014-108504: 11 12/2/2015 : 34 12/7/2015 : 32
## CA-2013-131338: 10 12/3/2015 : 34 12/13/2015: 30
## CA-2014-105732: 10 12/10/2015: 33 9/16/2015 : 30
## (Other) :9926 (Other) :9784 (Other) :9801
## Ship.Mode Customer.ID City State
## First Class :1538 WB-21850: 37 New York City: 915 California :2001
## Same Day : 543 JL-15835: 34 Los Angeles : 747 New York :1128
## Second Class :1945 MA-17560: 34 Philadelphia : 537 Texas : 985
## Standard Class:5968 PP-18955: 34 San Francisco: 510 Pennsylvania: 587
## CK-12205: 32 Seattle : 428 Washington : 506
## EH-13765: 32 Houston : 377 Illinois : 492
## (Other) :9791 (Other) :6480 (Other) :4295
## Postal.Code Region Product.ID
## Min. : 1040 Central:2323 OFF-PA-10001970: 19
## 1st Qu.:23223 East :2848 TEC-AC-10003832: 18
## Median :56430 South :1620 FUR-FU-10004270: 16
## Mean :55190 West :3203 FUR-CH-10001146: 15
## 3rd Qu.:90008 FUR-CH-10002647: 15
## Max. :99301 TEC-AC-10002049: 15
## (Other) :9896
## Product.Name Segment Category
## Staple envelope : 48 Consumer :5191 Furniture :2121
## Easy-staple paper : 46 Corporate :3020 Office Supplies:6026
## Staples : 46 Home Office:1783 Technology :1847
## Avery Non-Stick Binders : 20
## Staples in misc. colors : 19
## KI Adjustable-Height Table: 18
## (Other) :9797
## Sub.Category Revenue Quantity Discount
## Binders :1523 Min. : 0.444 Min. : 1.00 Min. :0.0000
## Paper :1370 1st Qu.: 17.280 1st Qu.: 2.00 1st Qu.:0.0000
## Furnishings: 957 Median : 54.490 Median : 3.00 Median :0.2000
## Phones : 889 Mean : 229.858 Mean : 3.79 Mean :0.1562
## Storage : 846 3rd Qu.: 209.940 3rd Qu.: 5.00 3rd Qu.:0.2000
## Art : 796 Max. :22638.480 Max. :14.00 Max. :0.8000
## (Other) :3613
## Profit
## Min. :-6599.980
## 1st Qu.: 1.730
## Median : 8.665
## Mean : 28.657
## 3rd Qu.: 29.360
## Max. : 8399.980
##
Comments + Many categorical variables, will be able to look into very specific relationships between variables.
- Correlation between variables will be a good thing to look into
Quantitative variables:
-Revenue: extremely right skewed. Mean is larger than median -Quantity: fairly normally distributed, even with a large maximum -Discount: Median is slightly higher than the mean, so fairly left skewed -Profit: Also right skewed. Mean is larger than median.
The median will be a less biased measure during the EDA
## Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 CA-2014-152156 11/9/2014 11/12/2014 Second Class CG-12520
## 2 CA-2014-152156 11/9/2014 11/12/2014 Second Class CG-12520
## 3 CA-2014-138688 6/13/2014 6/17/2014 Second Class DV-13045
## 4 US-2013-108966 10/11/2013 10/18/2013 Standard Class SO-20335
## 5 US-2013-108966 10/11/2013 10/18/2013 Standard Class SO-20335
## 6 CA-2012-115812 6/9/2012 6/14/2012 Standard Class BH-11710
## City State Postal.Code Region Product.ID
## 1 Henderson Kentucky 42420 South FUR-BO-10001798
## 2 Henderson Kentucky 42420 South FUR-CH-10000454
## 3 Los Angeles California 90036 West OFF-LA-10000240
## 4 Fort Lauderdale Florida 33311 South FUR-TA-10000577
## 5 Fort Lauderdale Florida 33311 South OFF-ST-10000760
## 6 Los Angeles California 90032 West FUR-FU-10001487
## Product.Name Segment
## 1 Bush Somerset Collection Bookcase Consumer
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back Consumer
## 3 Self-Adhesive Address Labels for Typewriters by Universal Corporate
## 4 Bretford CR4500 Series Slim Rectangular Table Consumer
## 5 Eldon Fold 'N Roll Cart System Consumer
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood Consumer
## Category Sub.Category Revenue Quantity Discount Profit
## 1 Furniture Bookcases 261.9600 2 0.00 41.91
## 2 Furniture Chairs 731.9400 3 0.00 219.58
## 3 Office Supplies Labels 14.6200 2 0.00 6.87
## 4 Furniture Tables 957.5775 5 0.45 -383.03
## 5 Office Supplies Storage 22.3680 2 0.20 2.52
## 6 Furniture Furnishings 48.8600 7 0.00 14.17
# Bar charts of the first four categorical variables, arranged in a 2-column grid
grid.arrange(
  ggplot(om, aes(x = Order.ID)) + geom_bar(),   # Order ID
  ggplot(om, aes(x = Order.Date)) + geom_bar(), # Order Date
  ggplot(om, aes(x = Ship.Date)) + geom_bar(),  # Ship Date
  ggplot(om, aes(x = Ship.Mode)) + geom_bar(),  # Ship Mode
  ncol = 2
)
Comments
Order.ID, Order.Date and Ship.Date are fairly dense
Ship.mode sees the highest count for Standard Class,
-beats the second highest -Second Class- by nearly 4000
# Bar charts of the customer/geography variables, arranged in a 2-column grid
grid.arrange(
  ggplot(om, aes(x = Customer.ID)) + geom_bar(),  # Customer ID
  ggplot(om, aes(x = City)) + geom_bar(),         # City
  ggplot(om, aes(x = State)) + geom_bar(),        # State
  ggplot(om, aes(x = Postal.Code)) + geom_bar(),  # Postal Code
  ggplot(om, aes(x = Region)) + geom_bar(),       # Region
  ncol = 2
)
Comments
Customer ID seems to follow a regular pattern
City appears to have a few dominant levels
State, like city, appears to have some dominant levels
Postal code works in tandem with city, and state, therefore no surprise to see dominant levels
Regions are fairly equal in count
-The range between highest(W) and lowest (S) is only about 2000
Questions
Are these states/cities the highest contributors to profit and revenue?
-Do these locations buy more, or are they just exposed to a lot of discounts?
What are differences in products being sold by region that contributes to a high or low count?
-Is the South a region we need to focus our marketing efforts?
# Bar charts of the product-related variables, arranged in a 2-column grid
grid.arrange(
  ggplot(om, aes(x = Product.ID)) + geom_bar(),    # Product ID
  ggplot(om, aes(x = Product.Name)) + geom_bar(),  # Product Name
  ggplot(om, aes(x = Segment)) + geom_bar(),       # Segment
  ggplot(om, aes(x = Category)) + geom_bar(),      # Category
  ggplot(om, aes(x = Sub.Category)) + geom_bar(),  # Sub-Category
  ncol = 2
)
Comments
There are a few products that far outperform the others
We see the most sales in the Consumer segment
-outperforms the second highest, corporate, by a little over 2000
We see the most sales of Office supplies
-outperforms the second highest, furniture, by nearly 4000
Sub category fluctuates
-would be interesting to take a closer look at which segments these subcategories belong to
###Revenue
# Use grid.arrange to stack a histogram over a flipped boxplot for Revenue
grid.arrange(
  ggplot(om, aes(x = Revenue)) + geom_histogram(),  # distribution shape
  ggplot(om, aes(x = 1)) +
    geom_boxplot(aes(y = Revenue)) +
    coord_flip(),  # flip so both panels share the Revenue axis
  ncol = 1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comments
Revenue is fairly clustered
A few outliers for revenue expands the boxplot graph
# Use grid.arrange to stack a histogram over a flipped boxplot for Quantity
grid.arrange(
  ggplot(om, aes(x = Quantity)) + geom_histogram(),  # distribution shape
  ggplot(om, aes(x = 1)) +
    geom_boxplot(aes(y = Quantity)) +
    coord_flip(),  # flip so both panels share the Quantity axis
  ncol = 1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comments
Quantity is right-skewed
-The mean (3.79) is higher than the median (3.00)
There are very few observations with a quantity greater than 10
# Use grid.arrange to stack a histogram over a flipped boxplot for Discount
grid.arrange(
  ggplot(om, aes(x = Discount)) + geom_histogram(),  # distribution shape
  ggplot(om, aes(x = 1)) +
    geom_boxplot(aes(y = Discount)) +
    coord_flip(),  # flip so both panels share the Discount axis
  ncol = 1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comments
Discount is left-skewed
-The mean is smaller than the median
A few outliers
We notice a big drop-off in discounts after 0.2
Questions
# Use grid.arrange to stack a histogram over a flipped boxplot for Profit
grid.arrange(
  ggplot(om, aes(x = Profit)) + geom_histogram(),  # distribution shape
  ggplot(om, aes(x = 1)) +
    geom_boxplot(aes(y = Profit)) +
    coord_flip(),  # flip so both panels share the Profit axis
  ncol = 1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comments
Profit is normally distributed
Centered around 0
A little higher on the postive side of 0 than the negative side, which is good
# Examine the relationship between Region and Ship.Mode with a contingency table
tabyl(om, Region, Ship.Mode) %>%   # table of counts
  adorn_totals(c("row", "col"))    # add row and column margin totals
## Region First Class Same Day Second Class Standard Class Total
## Central 299 120 465 1439 2323
## East 490 155 530 1673 2848
## South 234 83 329 974 1620
## West 515 185 621 1882 3203
## Total 1538 543 1945 5968 9994
Comments
Standard Class ship mode had the most occurrences.
-Most likely Standard Class is the most affordable option. The West region saw the most orders overall.
-Confirms what was seen in graphical EDA
Questions
# Proportion table for Region and Ship.Mode (each cell as a share of the grand total)
tabyl(om, Region, Ship.Mode) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%  # divide every cell by the overall total
  adorn_rounding(2)             # round to two decimal places
## Region First Class Same Day Second Class Standard Class Total
## Central 0.03 0.01 0.05 0.14 0.23
## East 0.05 0.02 0.05 0.17 0.28
## South 0.02 0.01 0.03 0.10 0.16
## West 0.05 0.02 0.06 0.19 0.32
## Total 0.15 0.05 0.19 0.60 1.00
Comments
West regions makes up the largest proportion of shipments.
-nearly double the amount when compared to the south region.
Eastern region has the second largest proportion of shipments.
Standard Class shipments are the most popular ship mode.
-60% of shipments are Standard Class -The next most popular ship mode only makes up 19% of shipments. -The least popular ship mode is Same Day at 5% (likely due to price).
# Now we look at the relationship between Segment and Category:
# proportion cross-table, cells as shares of the grand total
tabyl(om, Segment, Category) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%  # creates proportions
  adorn_rounding(2)             # round decimals
## Segment Furniture Office Supplies Technology Total
## Consumer 0.11 0.31 0.10 0.52
## Corporate 0.06 0.18 0.06 0.30
## Home Office 0.04 0.11 0.03 0.18
## Total 0.21 0.60 0.18 1.00
Comments
The consumer segment makes up about 50% or half of entire orders.
-Consumers are clearly the largest contributors to sales.
60% of orders are office supplies.
-This makes sense, as this is data from Office Max.
Questions
How does this finding fit into the categorical analysis found in Step 2?
-How can we use these data together to inform pricing/marketing strategy
# Region x Ship.Mode proportions, split into one table per Segment
tabyl(om, Region, Ship.Mode, Segment) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%  # creates proportions
  adorn_rounding(2)             # round decimals
## $Consumer
## Region First Class Same Day Second Class Standard Class Total
## Central 0.03 0.02 0.04 0.15 0.23
## East 0.05 0.02 0.06 0.16 0.28
## South 0.02 0.01 0.03 0.10 0.16
## West 0.05 0.02 0.06 0.19 0.32
## Total 0.15 0.06 0.20 0.59 1.00
##
## $Corporate
## Region First Class Same Day Second Class Standard Class Total
## Central 0.03 0.00 0.05 0.14 0.22
## East 0.04 0.01 0.05 0.18 0.29
## South 0.03 0.00 0.04 0.10 0.17
## West 0.06 0.01 0.07 0.18 0.32
## Total 0.16 0.04 0.20 0.60 1.00
##
## $`Home Office`
## Region First Class Same Day Second Class Standard Class Total
## Central 0.03 0.01 0.06 0.14 0.25
## East 0.05 0.01 0.05 0.17 0.28
## South 0.02 0.01 0.02 0.10 0.15
## West 0.06 0.03 0.05 0.18 0.32
## Total 0.16 0.06 0.18 0.60 1.00
Comments
Proportions of shipments appear consistent across Segments.
-For example, Corporate sees a similar distribution of ship modes
We see that, consistently, Office Max is most successful with shipments coming from the Western region.
-Derived from looking at the proportion of shipments based on region within each segment -Almost double the Southern region in some cases
Questions
Do we need to focus marketing or sales efforts in the Southern region or even in particular states, in order to increase profitability?
-Specifically which relationships, and what data manipualtion do we need to work with to reveal answers?
# Pearson correlation matrix of the numeric columns, rounded to 3 decimals
round(cor(select_if(om, is.numeric)), 3)
## Revenue Quantity Discount Profit
## Revenue 1.000 0.201 -0.028 0.479
## Quantity 0.201 1.000 0.009 0.066
## Discount -0.028 0.009 1.000 -0.219
## Profit 0.479 0.066 -0.219 1.000
Comments
Revenue and Profit have a strong correlation coefficient of 0.479.
Revenue and Quantity have a coefficient of 0.201.
-Makes sense: the more you sell, the more profitable you become. -Especially large chains go for a large number of sales to create revenue.
Discount and profit have a negative correlation of -0.219.
-This makes sense: the more they discount, the less profitable Office Max becomes.
Data shows results consistent with what we would expect.
-This is a solid sign that data and results are not off.
# Dodged bar charts of Segment, Category, and Ship.Mode counts within each Region.
# BUG FIX: the original piped `om` into ggplot() while ALSO passing `data = om`;
# with `data` bound by name, the piped data frame lands in the `mapping` argument,
# which errors ("mapping should be created with aes()"). The pipe is removed.
grid.arrange(
  ggplot(data = om, mapping = aes(x = Region, fill = Segment)) +
    geom_bar(position = "dodge"),
  ggplot(data = om, mapping = aes(x = Region, fill = Category)) +
    geom_bar(position = "dodge"),
  ggplot(data = om, mapping = aes(x = Region, fill = Ship.Mode)) +
    geom_bar(position = "dodge"),
  ncol = 1
)
Comments
This graph is consistent with what we’ve seen in the EDA thus far
-The South is consistently underperforming
The consumer segment has the most counts for each region
The office supplies category has the most counts for each region
Standard class has the most counts for each region
Questions
# More detail on Region vs Segment: raw counts (dodge) above within-region shares (fill)
grid.arrange(
  ggplot(om, aes(x = Region, fill = Segment)) +
    geom_bar(position = "dodge") +
    coord_flip(),
  ggplot(om, aes(x = Region, fill = Segment)) +
    geom_bar(position = "fill") +
    coord_flip(),
  ncol = 1
)
Comments
Questions
What contributes to poor performance in the South?
-Is poor performance due to lack of discount, product availability, for example?
# A tile graph needs an explicit count column to map onto fill
om %>%
  count(Region, Segment, name = "count") %>%
  ggplot(aes(Region, Segment)) +
  geom_tile(aes(fill = -count))  # negate so darker tiles mean higher counts
Comments
# Smoothed trend of Revenue against Quantity (no confidence ribbon)
ggplot(om, aes(x = Quantity, y = Revenue)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
# Discount vs Quantity: smoothed trend on top, raw scatter underneath
grid.arrange(
  ggplot(om, aes(x = Discount, y = Quantity)) + geom_smooth(se = FALSE),
  ggplot(om, aes(x = Discount, y = Quantity)) + geom_point(),
  ncol = 1
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
As the Discount increases, the Quantity per order increases
-Discount is more likely to inform Quantity
Questions
How does this translate to Quantity/Count in poor-performing regions?
Is Discount effective up until a certain point in increasing orders?
-Is there a limit to where a discount will no longer be effective.
# Quantity vs Profit: smoothed trend on top, raw scatter underneath
grid.arrange(
  ggplot(om, aes(x = Quantity, y = Profit)) + geom_smooth(se = FALSE),
  ggplot(om, aes(x = Quantity, y = Profit)) + geom_point(),
  ncol = 1
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
In general, no matter what the quantity is, the amount of profit remains about the same
-Revenues are informed by amount of orders rather than profit margins
# Discount and Revenue: smoothed trend on top, raw scatter underneath
grid.arrange(
  ggplot(om, aes(x = Discount, y = Revenue)) + geom_smooth(se = FALSE),
  ggplot(om, aes(x = Discount, y = Revenue)) + geom_point(),
  ncol = 1
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
We see that discount is beneficial for Office Max up until a certain point
-Around 0.5, discounts start to hinder OM rather than help OM.
# Discount and Profit: smoothed trend on top, raw scatter underneath
grid.arrange(
  ggplot(om, aes(x = Discount, y = Profit)) + geom_smooth(se = FALSE),
  ggplot(om, aes(x = Discount, y = Profit)) + geom_point(),
  ncol = 1
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
Works in tandem with Revenue graph
Only up to a certain point will discounts contribute to an increase in profit for OM
# Discount vs Profit, colored by Segment
# (the original header comment mislabeled this chunk as Region/Revenue/Segment)
ggplot(om, aes(x = Discount, y = Profit, color = Segment)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comments
All segments seem to follow the same trend
-As discount increases, contribution to profit decreases
We will investigate this further in our statistical eda
# Discount vs Profit, one linear fit per Region
ggplot(om, aes(x = Discount, y = Profit, color = Region)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
Comments
For each region, as discount increases, profit decreases
Works in tandem with the above graph, filled by Segment
# Discount vs Revenue, one linear fit per Region
ggplot(om, aes(x = Discount, y = Revenue, color = Region)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
Comments
For each region, as discount increases, contribution to revenue decreases
Follows the same trend we’ve seen thus far
# Revenue and Profit distributions compared across Regions
grid.arrange(
  ggplot(om, aes(x = Region, y = Revenue)) + geom_boxplot(),
  ggplot(om, aes(x = Region, y = Profit)) + geom_boxplot(),
  ncol = 1
)
Comments
This graph shows that the Revenue and Profit patterns are consistent by region
-Regardless of count/number of sales, all the regions observe similar trends
# Ten cities with the lowest total revenue
cities_botm_10 <- om %>%
  group_by(City) %>%
  summarise(total_rev = sum(Revenue)) %>%
  top_n(10, -total_rev) %>%  # keep the bottom ten by total revenue
  ggplot(aes(x = reorder(City, -total_rev), y = total_rev, fill = total_rev)) +
  geom_bar(stat = "identity") +
  coord_flip()  # horizontal bars so city names are readable
cities_botm_10
Comments
These are the cities that see the lowest amount of revenue
While this is nice to know, it’s too specific to pursue in detailed EDA
# Ten states with the highest total revenue (shown in thousands of dollars).
# BUG FIX: the original aes() had `fill -total_rev` (missing `=`), which is a
# syntax error; it is now `fill = -total_rev`.
states_top_10 <- om %>%
  group_by(State) %>%
  summarise(total_rev = sum(Revenue)) %>%
  top_n(10, total_rev) %>%
  ggplot(mapping = aes(x = reorder(State, total_rev), y = total_rev / 1000, fill = -total_rev)) +
  coord_flip() +
  geom_bar(stat = "identity")
states_top_10
Comments
This graph confirms what was discovered before in the EDA by region
This information is nice to know, but not worth pursuing in detailed EDA
# Most profitable and least profitable states
states_top_profit <- om %>%
  group_by(State) %>%
  summarise(total_profit = sum(Profit)) %>%
  top_n(10, total_profit) %>%  # ten highest-profit states
  ggplot(aes(x = reorder(State, total_profit), y = total_profit, fill = -total_profit)) +
  ggtitle("Highest Performing States") +
  coord_flip() +
  geom_bar(stat = "identity")
states_top_profit
states_lowest_profit <- om %>%
  group_by(State) %>%
  summarise(total_profit = sum(Profit)) %>%
  top_n(10, -total_profit) %>%  # ten lowest-profit states
  ggplot(aes(x = reorder(State, total_profit), y = total_profit, fill = -total_profit)) +
  ggtitle(" Lowest Performing States") +
  coord_flip() +
  geom_bar(stat = "identity")
states_lowest_profit
Comments
When paired with the graph before, it’s interersting to see that Texas brings in the third most revenue, but suffers huge profit loss
None of the higest performing states are in the south
How does discount in each category affect profit?
# Total revenue by Region, broken out by Segment (bar heights sum Revenue per group)
ggplot(om, aes(x = Region, y = Revenue, fill = Segment)) +
  geom_bar(stat = "identity", position = "dodge")
Comments
Consistent with earlier findings, Central and South seem to have higher revenue than the other regions.
The South has high home-office sales compared to others
-Almost disproportionately large
Questions
Why do the Central and South regions have such high revenues?
-We need to look at discount.
# Density of the Segment mix within each Region
om %>%
  ggplot(aes(x = Region, fill = Segment)) +
  geom_density(alpha = 0.5)
# Profit (in hundreds) by Region and Segment
om %>%
  ggplot(aes(x = Region, y = Profit / 100, fill = Segment)) +
  geom_boxplot()
# Average profit (scaled) by Region and Segment
om %>%
  group_by(Region, Segment) %>%
  summarize(mean_profit_000 = mean(Profit) / 1000) %>%
  ggplot(aes(x = Region, y = mean_profit_000, fill = Segment)) +
  geom_bar(stat = "identity", position = "dodge")
# Average profit comparison
Comments
# Look at which segments drive profit in the lowest-profit State/Segment pairs
om %>%
  group_by(State, Segment) %>%
  summarise(total_profit = sum(Profit)) %>%
  top_n(10, -total_profit) %>%  # bottom ten by total profit
  ggplot(aes(x = reorder(State, total_profit), y = total_profit, fill = Segment)) +
  coord_flip() +
  geom_bar(stat = "identity")
Comments
# Why are certain states making more, less, or no profit? Look at how much
# discount they give (largest discounts within each State/Segment group).
om %>%
  group_by(State, Segment) %>%
  top_n(10, Discount) %>%
  ggplot(aes(x = reorder(State, Discount), y = Discount, fill = Segment)) +
  coord_flip() +
  geom_bar(stat = "identity")
Comments
# Mean discount by Region and Segment, ordered by descending mean discount
om %>%
  group_by(Region, Segment) %>%
  summarise(mean_discount = mean(Discount)) %>%  # mean discount level for the graph
  top_n(10, mean_discount) %>%
  ggplot(aes(x = reorder(Region, mean_discount), y = mean_discount, fill = Segment)) +
  coord_flip() +
  geom_bar(stat = "identity", position = "dodge")
Comments
Questions
# Discount vs Revenue scatter, colored by product Category
ggplot(om, aes(x = Discount, y = Revenue)) +
  geom_point(aes(color = Category))
Comments
Questions
# Do discounts affect profits? Flag each order as discounted (1) or full
# price (0), then compare mean profit with a Welch two-sample t-test.
om <- om %>%
  mutate(Disc = as.numeric(Discount > 0))
t.test(om$Profit[om$Disc == 1], om$Profit[om$Disc == 0])
##
## Welch Two Sample t-test
##
## data: om$Profit[om$Disc == 1] and om$Profit[om$Disc == 0]
## t = -15.738, df = 9162.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -82.71926 -64.39557
## sample estimates:
## mean of x mean of y
## -6.657061 66.900350
Comments
The mean profit of -6.66 when a discount occurs is significantly different from the mean profit of 66.90 when no discount occurs.
The very small p-value confirms that there is a statistically significant (reliable) difference between profits when a discount is or is not present at the time of order.
Questions
# Look at discount and profit: pairwise t-tests of Profit across discount levels
# (Discount is coerced to a factor; p-values are Holm-adjusted by default)
pairwise.t.test(om$Profit, as.factor(om$Discount))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: om$Profit and as.factor(om$Discount)
##
## 0 0.1 0.15 0.2 0.3 0.32 0.4 0.45 0.5
## 0.1 1.00000 - - - - - - - -
## 0.15 1.00000 1.00000 - - - - - - -
## 0.2 1.7e-15 0.07897 1.00000 - - - - - -
## 0.3 1.5e-11 1.6e-05 0.73216 0.00025 - - - - -
## 0.32 0.01326 0.00730 0.66145 0.25531 1.00000 - - - -
## 0.4 < 2e-16 1.0e-11 0.00306 2.8e-15 0.07546 1.00000 - - -
## 0.45 0.00076 0.00035 0.02414 0.00866 0.25531 1.00000 1.00000 - -
## 0.5 < 2e-16 < 2e-16 6.0e-14 < 2e-16 4.4e-15 0.00076 2.9e-08 1.00000 -
## 0.6 9.6e-07 0.00020 1.00000 0.01909 1.00000 1.00000 0.16380 0.25531 2.0e-13
## 0.7 < 2e-16 7.5e-12 0.00815 < 2e-16 0.19829 1.00000 1.00000 1.00000 4.8e-11
## 0.8 < 2e-16 9.4e-12 0.00588 < 2e-16 0.14412 1.00000 1.00000 1.00000 6.6e-10
## 0.6 0.7
## 0.1 - -
## 0.15 - -
## 0.2 - -
## 0.3 - -
## 0.32 - -
## 0.4 - -
## 0.45 - -
## 0.5 - -
## 0.6 - -
## 0.7 0.39329 -
## 0.8 0.27461 1.00000
##
## P value adjustment method: holm
# Mean profit per discount level with 95% confidence-interval error bars
z <- qnorm(0.975)  # two-sided 95% critical value
om %>%
  group_by(Discount) %>%
  summarise(mean_prof = mean(Profit), sd = sd(Profit), n = n(), ci = z * sd / sqrt(n)) %>%
  ggplot(aes(x = Discount, y = mean_prof)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_prof - ci, ymax = mean_prof + ci), width = 0.1)
Comments + In the table we see that there is a statistically significant difference between profit levels at a 20 percent discount and a 30 percent discount.
-The p-value for that comparison (row 0.3, column 0.2) is about 0.00025, which is very small and highly statistically significant.
-This means we reject the null hypothesis that there is no difference in mean profits depending on the percentage discounted.
The bar chart shows the same visually: the error bars indicate that the difference in profitability between a 20 percent discount and a 30 percent discount is both statistically significant and practically meaningful.
-The error bars do not overlap, and the bars show profits going from positive to negative levels
There seems to be a sweet spot: we saw earlier that discounts usually increase revenue but not always profits. This suggests OM can potentially discount up to 20 percent to increase revenues without suffering a significant profit loss.
# Revenue and discount pairwise t-tests (Holm-adjusted).
# CONSISTENCY FIX: coerce Discount to a factor explicitly, matching the Profit
# analysis above; pairwise.t.test() would coerce the grouping vector internally
# anyway, so the test results are unchanged.
pairwise.t.test(om$Revenue, as.factor(om$Discount))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: om$Revenue and om$Discount
##
## 0 0.1 0.15 0.2 0.3 0.32 0.4 0.45 0.5
## 0.1 2.0e-06 - - - - - - - -
## 0.15 0.01387 1.00000 - - - - - - -
## 0.2 1.00000 4.5e-07 0.00677 - - - - - -
## 0.3 2.3e-06 1.00000 1.00000 2.7e-07 - - - - -
## 0.32 0.24332 1.00000 1.00000 0.16287 1.00000 - - - -
## 0.4 6.7e-13 1.00000 1.00000 4.1e-14 1.00000 1.00000 - - -
## 0.45 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 - -
## 0.5 < 2e-16 0.04545 0.04545 < 2e-16 1.6e-05 0.29438 0.00619 1.00000 -
## 0.6 0.02476 6.2e-09 6.6e-05 0.07415 4.8e-08 0.00611 1.3e-12 0.48076 < 2e-16
## 0.7 0.00149 4.0e-10 7.4e-05 0.01410 1.0e-10 0.01121 < 2e-16 0.75047 < 2e-16
## 0.8 0.00014 4.2e-11 1.4e-05 0.00149 1.1e-11 0.00400 < 2e-16 0.48076 < 2e-16
## 0.6 0.7
## 0.1 - -
## 0.15 - -
## 0.2 - -
## 0.3 - -
## 0.32 - -
## 0.4 - -
## 0.45 - -
## 0.5 - -
## 0.6 - -
## 0.7 1.00000 -
## 0.8 1.00000 1.00000
##
## P value adjustment method: holm
# Mean revenue per discount level with 95% confidence-interval error bars.
# BUG FIX: the original assigned qnorm(0.975) to an unused variable `m` while
# the pipeline used `z` from an earlier chunk; define `z` here so this chunk
# is self-contained (the value is identical either way).
z <- qnorm(0.975)
om %>%
  group_by(Discount) %>%
  summarise(mean_rev = mean(Revenue), sd = sd(Revenue), n = n(), ci = z * sd / sqrt(n)) %>%
  ggplot(mapping = aes(x = Discount, y = mean_rev)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_rev - ci, ymax = mean_rev + ci),
                width = 0.2)
Comments
In the table we see that a 50% discount compared to a 60, 70, or 80% discount produced significantly different revenue levels. Up to the 50% discount level revenues are increasing; however, after this threshold revenue drops significantly. We see this through the very low p-values in the table surrounding the 50% discount level.
We see that the p value are very small.
We reject the null hypothesis that there is no statisitally significant difference in revenue levels at these different discount levels.
the bar chart shows the same visually.
The lowest performing regions are South and Central. We looked at their levels of discount to find the reason for this.
High discounts looked appealing when looking at revenues only.
-The South and Central regions did not appear to be underperforming when only looking at revenue, which is very misleading.
# Publication-style figure: mean discount by Region and Segment
mean_discount <- om %>%
  group_by(Region, Segment) %>%
  summarise(mean_discount = mean(Discount)) %>%
  top_n(10, mean_discount) %>%
  ggplot(aes(x = reorder(Region, mean_discount), y = mean_discount, fill = Segment)) +
  coord_flip() +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  ggtitle("Mean Discount is highest for the Central and Southern Regions",
          subtitle = "These regions were also the lowest performers for mean profit.") +
  labs(x = "", y = "Average Discount", fill = "") +
  scale_fill_discrete_sequential(palette = "RedOr")
mean_discount
# Publication-style figure: mean profit by Region and Segment
mean_profit <- om %>%
  group_by(Region, Segment) %>%
  summarize(mean_profit_000 = mean(Profit)) %>%
  ggplot(aes(x = reorder(Region, mean_profit_000), y = mean_profit_000, fill = Segment)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Profit is the lowest for the Central and Southern Regions",
          subtitle = "These regions saw the highest amount of discounts") +
  coord_flip() +
  theme_classic() +
  labs(x = "", y = "Average Profit", fill = "") +
  scale_y_continuous(label = dollar_format()) +  # dollar-formatted axis
  scale_fill_discrete_sequential(palette = "RedOr")
mean_profit
# Publication-style figure: mean revenue vs discount, one smoothed curve per Region
revenue_discount <- om %>%
  group_by(Discount, Region) %>%
  summarise(mean_revenue = mean(Revenue)) %>%
  ggplot(aes(x = Discount, y = mean_revenue, color = Region)) +
  geom_smooth(se = FALSE) +
  theme_classic() +
  ggtitle("The South's high discounts influence high revenues",
          subtitle = "This is a double-edged sword when paired with profit data") +
  labs(x = "", y = "", color = "") +
  scale_y_continuous(label = dollar_format()) +  # dollar-formatted axis
  scale_color_discrete_sequential(palette = "RedOr")
revenue_discount
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.0035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.2035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.25351
# Save the three headline figures as PNGs (ggsave default size: 7 x 5 inches)
ggsave(filename = "mean_discount.png", plot = mean_discount)
## Saving 7 x 5 in image
ggsave(filename = "mean_profit.png", plot = mean_profit)
## Saving 7 x 5 in image
ggsave(filename = "revenue_discount.png", plot = revenue_discount)
## Saving 7 x 5 in image
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.0035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.2035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.25351